About this file

This Markdown file is for the report exercise of Chapter 9 (Supervised Machine learning I) of the AGDS Course. The exercise mostly centers around k-nearest-neighbors models, how they compare to linear regression when predicting GPP data by splitting data into training and test-datasets, and what role the k (number of neighbors) has on the models and how tuning that number influences modelling. This is done in part through visualization and metrics like RMSE and MAE to calculate the goodness of the models and analyze their bias-variance tradeoff.

daily_fluxes <-  readr::read_csv("./data/FLX_CH-Dav_FLUXNET2015_FULLSET_DD_1997-2014_1-3.csv") |>

 # select only the variables we are interested in
  dplyr::select(TIMESTAMP,
                GPP_NT_VUT_REF,    # the target
                ends_with("_QC"),  # quality control info
                ends_with("_F"),   # includes all all meteorological covariates
                -contains("JSB")   # weird useless variable
                ) |>

  # convert to a nice date object
  dplyr::mutate(TIMESTAMP = ymd(TIMESTAMP)) |>

  # set all -9999 to NA
  dplyr::mutate(across(where(is.numeric), ~na_if(., -9999))) |> 
  
  # retain only data based on >=80% good-quality measurements
  # overwrite bad data with NA (not dropping rows)
  dplyr::mutate(GPP_NT_VUT_REF = ifelse(NEE_VUT_REF_QC < 0.8, NA, GPP_NT_VUT_REF),
                TA_F           = ifelse(TA_F_QC        < 0.8, NA, TA_F),
                SW_IN_F        = ifelse(SW_IN_F_QC     < 0.8, NA, SW_IN_F),
                LW_IN_F        = ifelse(LW_IN_F_QC     < 0.8, NA, LW_IN_F),
                VPD_F          = ifelse(VPD_F_QC       < 0.8, NA, VPD_F),
                PA_F           = ifelse(PA_F_QC        < 0.8, NA, PA_F),
                P_F            = ifelse(P_F_QC         < 0.8, NA, P_F),
                WS_F           = ifelse(WS_F_QC        < 0.8, NA, WS_F)) |> 

  # drop QC variables (no longer needed)
  dplyr::select(-ends_with("_QC"))

Comparison of the linear regression and KNN models

1: evaluating KNN and linear regression

Adopt the code from the Chapter for fitting and evaluating the linear regression model and the KNN

Ctrl+C, Ctrl+V from the chapter and clean up some issues

# Data splitting
set.seed(1982)  
daily_fluxes <- daily_fluxes[!(colnames(daily_fluxes)) == "LW_IN_F"]
#due to the high amount of NA values in LW_IN_F, and the according to the script is a priori not critical for GPP predictions, we drop this variable

split <- rsample::initial_split(daily_fluxes, prop = 0.7, strata = "VPD_F")
daily_fluxes_train <- rsample::training(split)

daily_fluxes_test <- rsample::testing(split)

# Model and pre-processing formulation, use all variables but LW_IN_F
pp <- recipes::recipe(GPP_NT_VUT_REF ~ SW_IN_F + VPD_F + TA_F, 
                      data = daily_fluxes_train |> drop_na()) |> 
  recipes::step_BoxCox(all_predictors()) |> 
  recipes::step_center(all_numeric(), -all_outcomes()) |>
  recipes::step_scale(all_numeric(), -all_outcomes())

# Fit linear regression model
mod_lm <- caret::train(
  pp, 
  data = daily_fluxes_train |> drop_na(), 
  method = "lm",
  trControl = caret::trainControl(method = "none"),
  metric = "RMSE"
)
summary(mod_lm)
## 
## Call:
## lm(formula = .outcome ~ ., data = dat)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.1728 -1.0709 -0.1613  0.9008  7.3409 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.21581    0.02446  131.47   <2e-16 ***
## SW_IN_F      1.16519    0.03297   35.34   <2e-16 ***
## VPD_F       -0.81775    0.03631  -22.52   <2e-16 ***
## TA_F         1.93384    0.03411   56.70   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.593 on 4238 degrees of freedom
## Multiple R-squared:  0.6645, Adjusted R-squared:  0.6642 
## F-statistic:  2798 on 3 and 4238 DF,  p-value: < 2.2e-16
# Fit KNN model
mod_knn <- caret::train(
  pp, 
  data = daily_fluxes_train |> drop_na(), 
  method = "knn",
  trControl = caret::trainControl(method = "none"),
  tuneGrid = data.frame(k = 8),
  metric = "RMSE"
)
#
source("./eval_model.R")

#linear regression model
eval_model(mod = mod_lm, df_train = daily_fluxes_train, df_test = daily_fluxes_test)

figure 1: Evaluation of linear regression model fit vs. GPP measurements.

# KNN 
 eval_model(mod = mod_knn, df_train = daily_fluxes_train, df_test = daily_fluxes_test)

figure 2: Evaluation of k-nearest-neighbor model fit vs. GPP measurements.

#Do the thing that is within the function in eval_model so I can have the fits for the other exercises, since for reasons I do not comprehend due to my weak understanding of programming, the fits simply do not get appended to the dataframes when simply running eval_model (and maybe that's normal).


#for the training data
daily_fluxes_train <- daily_fluxes_train |> 
    drop_na()

daily_fluxes_train$lm_fitted <- predict(mod_lm, newdata = daily_fluxes_train)

  daily_fluxes_train$knn_fitted <- predict(mod_knn, newdata = daily_fluxes_train)

#for the test data
daily_fluxes_test <- daily_fluxes_test |> 
    drop_na()

daily_fluxes_test$lm_fitted <- predict(mod_lm, newdata = daily_fluxes_test)


daily_fluxes_test$knn_fitted <- predict(mod_knn, newdata = daily_fluxes_test)
  


# combining the training and test data to prepare for ex.3  
  
lm_fit_df <- data.frame(
  c(daily_fluxes_train$TIMESTAMP, daily_fluxes_test$TIMESTAMP), 
  c(daily_fluxes_train$lm_fitted, daily_fluxes_test$lm_fitted))

names(lm_fit_df) <- c("TIMESTAMP", "fitted")


KNN_fit_df <- data.frame(
  c(daily_fluxes_train$TIMESTAMP, daily_fluxes_test$TIMESTAMP), 
  c(daily_fluxes_train$knn_fitted, daily_fluxes_test$knn_fitted))

names(KNN_fit_df) <- c("TIMESTAMP", "fitted")

2: Interpret observed differences in the context of the bias-variance trade-off

2.1 Why is the difference between the evaluation on the training and the test set larger for the KNN model than for the linear regression model?

The KNN model could be slightly over- or underfitting for the training data, meaning when applying the model that works for the training data onto the testing data, it is a worse fit due to the slight “wiggle” that the line has. The lm on the other hand is just a fully straight line, which means it doesn’t fit as well in general but at the same time differences between training and testing data do not influence the RMSE much.

2.2 Why is the does the evaluation on the test set indicate a better model performance of the KNN model than the linear regression model?

Because it is more flexible than “just a straight line” which is the lm.

2.3 How would you position the KNN and the linear regression model along the spectrum of the bias-variance trade-off?

Linear Regressions have a high bias, as they are not affected by the noise in observations, but its predictions are further away from the observations (just like model poly1 in the chapter) because it oversimplifies the model. KNN can be low bias and high variance when using a k which is too small, or high bias and low variance when using a k which is too big.

3 Visualizing

Visualise temporal variations of observed and modelled GPP for both models, covering all available dates.

colors1 <- c("GPP Measured" = "black", "Linear fit" = "green", "KNN fit" = "red")


ggplot() +
  geom_point(data = daily_fluxes, aes(x = TIMESTAMP, y = GPP_NT_VUT_REF,  color = "GPP Measured"), alpha = 0.8) +
  geom_line(data = lm_fit_df, aes(x = TIMESTAMP, y = fitted,  color = "Linear fit"), alpha = 0.5) +
  geom_line(data = KNN_fit_df, aes(x = TIMESTAMP, y = fitted,  color = "KNN fit"), alpha = 0.5) +

    scale_color_manual(values = colors1) +
   
  labs(x = "Year", y = "GPP") +
  
  theme(axis.text = element_text(size = 20), axis.title = element_text(size = 30)) + 
  
  theme(legend.key.size = unit(2, "cm"), 
    legend.text = element_text(size = 20), legend.title = element_blank())

figure 3: temporal variations of observed (black) and modelled GPP with KNN (red) and linear regression (green) for the period 1997-2014.

ggplot() +
      geom_point(data = daily_fluxes_train, aes(x = TIMESTAMP, y = GPP_NT_VUT_REF),  color = 'red', alpha = 0.5) +
#  geom_line(data = lm_model[[2]], aes(x = TIMESTAMP, y = fitted),  color = 'green') +
  geom_point(data = daily_fluxes_train, aes(x = TIMESTAMP, y = knn_fitted),  color = 'blue', alpha = 0.5)

ggplot() +
      geom_point(data = daily_fluxes_test, aes(x = TIMESTAMP, y = GPP_NT_VUT_REF),  color = 'red', alpha = 0.5) +
  geom_point(data = daily_fluxes_test, aes(x = TIMESTAMP, y = knn_fitted),  color = 'blue', alpha = 0.5)

These plots are done to compare test and train model fits with the actual GPP measurements. With k = 1 it’s very visible how GPP and fit are exactly the same for the train graphic (except the few points where GPP could not be predicted due to NA values) but when comparing it to the test dataset, it is all over the place, signifying an overfitting onto the training dataset and thus low generalization of the fit.

The role of k

1 R2 and MAE depending on k

Based on your understanding of KNN (and without running code), state a hypothesis for how the R2 and the MAE evaluated on the test and on the training set would change for k approaching 1 and for k approaching N (the number of observations in the data). Explain your hypothesis, referring to the bias-variance trade-off.

k = 1 would mean that the line would run through all the training observation, leading to an R2 of 1.00 for the training data, but it would very likely have a bad R2 as well as mean error when applying to the testing dataset (overfitting, high variance).

k = N means a too generalized model, which should perform somewhat poorly in both R2 and MAE, and values for training and test models should be close each other (underfitting, high bias)

2 Test different values for k

Write code that splits the data into a training and a test set and repeats model fitting and evaluation for different values for k. Visualise results, showing model generalisability as a function of model complexity.

#first the loop
plots <- list()
moreplots <- list()
different_ks <- c(1, 2, 5, 8, 15, 50, 150, 300)
knn_diff_k <- list()
all_models <- list()

for (i in 1:length(different_ks)){
  knn_diff_k[[i]] <- caret::train(
    pp, 
    data = daily_fluxes_train |> drop_na(), 
    method = "knn",
    trControl = caret::trainControl(method = "none"),
    tuneGrid = data.frame(k = different_ks[i]),
    metric = "RMSE")
  all_models[[i]] <- eval_model(knn_diff_k[[i]], daily_fluxes_train, daily_fluxes_test) 
  
  daily_fluxes_train$knn_fitted <- predict(knn_diff_k[[i]], newdata = daily_fluxes_train)
  
  plots[[i]] <- ggdraw(all_models[[i]]) + draw_label(paste("k =", different_ks[i]), x = 0.45, y = 0.96, color = 'red')

  moreplots[[i]] <- ggplot() +
      geom_point(data = daily_fluxes_train, aes(x = TIMESTAMP, y = GPP_NT_VUT_REF),  color = 'red', alpha = 0.5) +
  geom_point(data = daily_fluxes_train, aes(x = TIMESTAMP, y = knn_fitted),  color = 'blue', alpha = 0.5) +
    labs(x = "year", y = "GPP")
    
}
source("./plot_helper.R")
plots_draw(plots)

plots_draw(moreplots)

figure 4: GPP vs. model fit and evaluation for 8 different values for k

#plots 
 
moreplots
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

## 
## [[6]]

## 
## [[7]]

## 
## [[8]]

#also cowplot still does not work this confuses me a lot



#function to determine MAE based on k 
compute_MAE <- function(k){
  knn_diff_k <- caret::train(
    pp, 
    data = daily_fluxes_train |> drop_na(), 
    method = "knn",
    trControl = caret::trainControl(method = "none"),
    tuneGrid = data.frame(k = k),
    metric = "RMSE")
  
  daily_fluxes_test$fitted <- predict(knn_diff_k, newdata = daily_fluxes_test)
  
  metrics_test <- daily_fluxes_test |> 
    yardstick::metrics(GPP_NT_VUT_REF, fitted)
  
  MAE_knn_test <- metrics_test |> 
    filter(.metric == "mae") |> 
    pull(.estimate)
  
  return(MAE_knn_test)
}

for (i in 1:length(different_ks)){
  print(compute_MAE(different_ks[i]))

}
## [1] 1.495793
## [1] 1.303756
## [1] 1.175692
## [1] 1.147647
## [1] 1.123898
## [1] 1.111188
## [1] 1.128479
## [1] 1.153778

3 optimal k

Is there an “optimal” k in terms of model generalisability? Edit your code to determine an optimal k.

Just like in the previous exercise, we use the Mean absolute error (MAE) on the test dataset, determining it for every between 0 and 100.

all_MAE <- c()

for (i in 1:100){
  all_MAE[i] <- compute_MAE(i)
  
}

min(all_MAE)
## [1] 1.104253
which.min(all_MAE)
## [1] 32
#all_MAE <- data.frame(all_MAE)

ggplot(
  data = data.frame(all_MAE), aes(x = 1:100, y = all_MAE)) + 
  geom_point() +
  geom_point(aes(x = which.min(all_MAE),
                 y =  min(all_MAE), color = 'red')) + 
  theme(legend.position = "none") +
  labs(y = "Mean absolute error", x = "k")

Using the MAE as a metric and the test-datasets for various k, it seems as k = 32 is the optimal value for highest generalisability, as it has the lowest Mean Absolute Error for the test data. However, it is hard to come to a simple conclusion using just this somewhat rigid dataset without any re-sampling or other metric.